library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(dplyr)
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggExtra)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(scico)
library(stringr)
library(naniar)
BeerData = data.frame(read.csv("C:/SMU MS Data Science/DoingDataScience6306/MSDS_6306_Doing-Data-Science/Unit 8 and 9 Case Study 1/Beers.csv"))
BreweriesData = data.frame(read.csv("C:/SMU MS Data Science/DoingDataScience6306/MSDS_6306_Doing-Data-Science/Unit 8 and 9 Case Study 1/Breweries.csv"))
#BeerData = data.frame(read.csv("~/SMU/Term 1/Doing Data Science/MSDS_6306_Doing-Data-Science-Master/Unit 8 and 9 Case Study 1/Beers.csv"))
stock_data = BeerData #this is for the gg_miss_var function
#BreweriesData = data.frame(read.csv("~/SMU/Term 1/Doing Data Science/MSDS_6306_Doing-Data-Science-Master/Unit 8 and 9 Case Study 1/Breweries.csv"))
# Tidy data
#Set state as a factor
BreweriesData$State = as.factor(BreweriesData$State)
BeerData$Style = as.factor(BeerData$Style)
# Remove (year) at the end of the names of beers
BeerData$Name = str_remove(BeerData$Name, "([:space:]|)\\(\\d\\d\\d\\d\\)")
# Remove white space at the start and end of beer names
BeerData$Name = str_trim(BeerData$Name)
#Shows how many duplicates of each beer, 240 unwanted duplicates in total
#BeerData %>% count(Name) %>% ungroup() %>% arrange(desc(n))
# Remove beers with the same name, removes 240 unwanted duplicates
BeerData = BeerData %>% group_by(Name) %>% filter(!duplicated(Name))
# Manually add in missing data for beers for question/item #3
# Parsing through the data, we can see that a large amount of information from the IBV and ABV values of beer are missing.
# While it is possible to hand-fill out values for both, counting the total number of missing values over both groups add up to over 1000 missing values.
# For the most part, we can ignore the missing data, but we cannot prove that the missing data is truly random.
# While we can perform an analysis on the data, we must be cautious as to the bias the exclusion of missing data could apply to our results.
# After googling various beers without IBU's and looking for the missing value it seems like many are hard to find
BeerData[BeerData$Name == "Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale",]$IBU = 35
BeerData[BeerData$Name == "Pub Beer",]$IBU = 18
BeerData[BeerData$Name == "Sinister",]$IBU = 30
#gg_miss_var(BeerData) Did not work as intended, might use to visualize missing values, might not be necessary
gg_miss_var(stock_data)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

# Merge beer data with breweries data for question/item #2
names(BreweriesData)[1] = "Brewery_id"
mergedBeerData = merge(BeerData, BreweriesData, by = "Brewery_id")
head(mergedBeerData, 6)
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
tail(mergedBeerData, 6)
## Brewery_id Name.x Beer_ID ABV IBU
## 2165 556 Pilsner Ukiah 98 0.055 NA
## 2166 557 Heinnieweisse Weissebier 52 0.049 NA
## 2167 557 Snapperhead IPA 51 0.068 NA
## 2168 557 Moo Thunder Stout 50 0.049 NA
## 2169 557 Porkslap Pale Ale 49 0.043 NA
## 2170 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2165 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2166 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2167 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2168 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2169 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2170 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2165 CA
## 2166 NY
## 2167 NY
## 2168 NY
## 2169 NY
## 2170 AK
# Find the beer with the largest ABV and IBU and return those row for question #5
# The beer with the highest ABV is the Lee Hill Series Vol. 5 – Belgian Style Quadrupel Ale at 12.8% ABV from the state of Colorado
# The beer with the highest IBU is the Bitter Bitch Imperial IPA with an ABU of 138 from Oregon
"Highest ABV:"
## [1] "Highest ABV:"
max(mergedBeerData$ABV, na.rm = TRUE)
## [1] 0.128
"Highest IBU:"
## [1] "Highest IBU:"
max(mergedBeerData$IBU, na.rm = TRUE)
## [1] 138
mergedBeerData[which.max(mergedBeerData$ABV),]
## Brewery_id Name.x Beer_ID
## 343 52 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565
## ABV IBU Style Ounces Name.y City State
## 343 0.128 35 Quadrupel (Quad) 19.2 Upslope Brewing Company Boulder CO
mergedBeerData[which.max(mergedBeerData$IBU),]
## Brewery_id Name.x Beer_ID ABV IBU
## 1671 375 Bitter Bitch Imperial IPA 980 0.082 138
## Style Ounces Name.y City
## 1671 American Double / Imperial IPA 12 Astoria Brewing Company Astoria
## State
## 1671 OR
# create histogram of the number of breweries in each state for question #1
# Colorado has the most breweries followed by California
BreweriesData %>% ggplot(aes(x=fct_infreq(State))) +
geom_bar(stat = "count") +
labs(title="Number of Breweries in each State",
y="Count",
x="State") +
theme(axis.text.x = element_text(size = rel(0.8), angle = 90))

# Show a map of the US to show how many breweries are in each state with plotly for question #1
# The coastal states tend to have more breweries than inland states. States know to have high populations tend to have more breweries.
g = list(scope = 'usa')
fig = plot_geo(BreweriesData, locationmode = 'USA-states') %>%
add_trace(
colors = 'purples'
) %>%
colorbar(title = "Number of breweries in each state") %>%
layout(geo = g)
## No scattergeo mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: Didn't find a colorbar to modify.
# Creates a map of the US to show count of how many breweries are in each state;
states_map <- map_data("state")
map_data1 = BreweriesData %>% count(State)
map_data1$State = state.name[match(str_squish(map_data1$State), state.abb)]
map_data1$State = tolower(map_data1$State)
ggplot(map_data1, aes(map_id = State)) +
geom_map(aes(fill = n), map = states_map) +
expand_limits(x = states_map$long, y = states_map$lat) +
labs(title = "Count of Breweries in the US")+
ggthemes::theme_map() +
scico::scale_fill_scico(palette = "berlin")

# Compute the median alcohol content and international bitterness unit for each state then plot with a bar chart for question number 4
# States with a higher number of breweries tend to be towards the middle of the rankings for ABV and IBU.
# D.C has the highest median ABV and Maine has the highest IBU
stateMedianABV = mergedBeerData %>% filter(!is.na(ABV)) %>% group_by(State) %>% summarise(Median = median(ABV)) %>% arrange(desc(Median))
stateMedianIBU = mergedBeerData %>% filter(!is.na(IBU)) %>% group_by(State) %>% summarise(Median = median(IBU)) %>% arrange(desc(Median))
stateNumberOfBeersABV = mergedBeerData %>% filter(!is.na(ABV)) %>% group_by(State) %>% tally()
stateNumberOfBeersIBU = mergedBeerData %>% filter(!is.na(IBU)) %>% group_by(State) %>% tally()
stateMedianABV = merge(stateMedianABV, stateNumberOfBeersABV, by = "State")
stateMedianIBU = merge(stateMedianIBU, stateNumberOfBeersIBU, by = "State")
#print out median ABV and/or IBU by state
#stateMedianABV
#stateMedianIBU
stateMedianABV %>% ggplot(aes(x = reorder(State, -Median), y = Median)) +
geom_bar(stat = "identity", aes(fill = n)) +
scico::scale_fill_scico(palette = "lajolla") +
theme(axis.text.x = element_text(size = rel(0.8), angle = 90)) +
labs(title="Median ABV by State",
y="Median ABV",
x="State",
fill = "# Beers")

stateMedianIBU %>% ggplot(aes(x = reorder(State, -Median), y = Median)) +
geom_bar(stat = "identity", aes(fill = n)) +
scico::scale_fill_scico(palette = "lajolla") +
theme(axis.text.x = element_text(size = rel(0.8), angle = 90)) +
labs(title="Median IBU by State",
y="Median IBU",
x="State",
fill = "# Beers")

# Summary statistics for ABV variable for question #6
# The mean ABV is higher than the median ABV. ABV ranges from 1 to 12.8 percent
summary(mergedBeerData %>% filter(!is.na(ABV)))
## Brewery_id Name.x Beer_ID ABV
## Min. : 1.00 Length:2110 Min. : 4.0 Min. :0.00100
## 1st Qu.: 94.25 Class :character 1st Qu.: 951.2 1st Qu.:0.05000
## Median :210.00 Mode :character Median :1538.5 Median :0.05600
## Mean :232.21 Mean :1499.0 Mean :0.05985
## 3rd Qu.:366.00 3rd Qu.:2107.8 3rd Qu.:0.06800
## Max. :558.00 Max. :2692.0 Max. :0.12800
##
## IBU Style Ounces
## Min. : 4.00 American IPA : 377 Min. : 8.40
## 1st Qu.: 22.00 American Pale Ale (APA) : 205 1st Qu.:12.00
## Median : 35.00 American Amber / Red Ale : 110 Median :12.00
## Mean : 42.67 American Blonde Ale : 94 Mean :13.61
## 3rd Qu.: 62.00 American Double / Imperial IPA: 91 3rd Qu.:16.00
## Max. :138.00 American Pale Wheat Ale : 69 Max. :32.00
## NA's :889 (Other) :1164
## Name.y City State
## Length:2110 Length:2110 CO : 213
## Class :character Class :character CA : 158
## Mode :character Mode :character MI : 143
## IN : 126
## TX : 112
## PA : 96
## (Other):1262
# Boxplot of ABV for question #6
# This shows a right skew of the ABV data
scatterPlotABV = ggplot(data = mergedBeerData %>% filter(!is.na(ABV)), aes(x=ABV)) +
geom_boxplot() +
ylim(-1,1) +
labs(title = "ABV Distrobution", x = "ABV") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
plot(scatterPlotABV)

#ggplotly(scatterPlotABV)
# Distribution of the ABV variable for question #
# This shows a right skew of the ABV data
barPlotABV <- ggplot(data = mergedBeerData %>% filter(!is.na(ABV)), aes(x=ABV)) +
geom_bar(stat = "count") +
labs(title="Distribution of ABV",
y="Count",
x="ABV")
ggplotly(barPlotABV)
# Scatter plot between IBU and ABV for question #7
# There is a plausible relationship between the ABV and IBU.
# While ABV values are low we see a general trend of low IBUs, and on the flipside, when ABV values are high, we see a corresponding trend of high IBUs.
# Naturally, there are outliers to this data, which are very likely contributing to the secondary curve in the graph and as previously mentioned, but generally this is a very plausible relationship.
scatterPlotIBUvsABV = mergedBeerData %>% filter(!is.na(ABV) & !is.na(IBU)) %>% ggplot(aes(x=ABV, y=IBU)) +
geom_point(position = "jitter") +
geom_smooth(method = "loess", se = F) +
labs(title = "Relationship between ABV and IBU",
y = "IBU",
x = "ABV")
#plot(scatterPlotIBUvsABV)
#ggplotly(scatterPlotIBUvsABV)
#ggMarginal(scatterPlotIBUvsABV, type = "histogram")
#ggMarginal(scatterPlotIBUvsABV, type = "boxplot")
ggMarginal(scatterPlotIBUvsABV)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Get list of all unique styles of beers and the number of beers of each style
# For question/item 8 and 9
# Only keep beers with a Style, ABV and IBU
uniqueStyles = mergedBeerData %>% filter(!is.na(Style))
# Create dataframe that contains every unique style and how many beers have that style
uniqueStyles = uniqueStyles %>% group_by(Style) %>% tally()
uniqueStyles
## # A tibble: 100 × 2
## Style n
## <fct> <int>
## 1 "" 5
## 2 "Abbey Single Ale" 2
## 3 "Altbier" 12
## 4 "American Adjunct Lager" 12
## 5 "American Amber / Red Ale" 118
## 6 "American Amber / Red Lager" 25
## 7 "American Barleywine" 3
## 8 "American Black Ale" 32
## 9 "American Blonde Ale" 97
## 10 "American Brown Ale" 65
## # … with 90 more rows
# Prepare data for knn modeling to predict style based off of ABV and IBU for question #8
# We can see that a K value of 6 yields the best accuracy at % 86.15960
# Only keep beers with a Style, ABV and IBU
AbvIbuBeer = mergedBeerData %>% filter(!is.na(ABV) & !is.na(IBU) & !is.na(Style))
# Set the style column to numeric to more easily manipulate the values
AbvIbuBeer$Style = as.character(AbvIbuBeer$Style)
# Keep only beers with a style containing string "IPA" or "Ale"
AbvIbuBeer = AbvIbuBeer[str_detect(AbvIbuBeer$Style, "IPA|Ale"),]
# If a syle doesn't contain the "IPA" but does contain "Ale" then change style to "Other"
AbvIbuBeer$Style[AbvIbuBeer$Style != grepl("IPA", AbvIbuBeer$Style, fixed = TRUE) & grepl("Ale", AbvIbuBeer$Style, fixed = TRUE)] = "Other"
# Figure out how to change any style containing "IPA" to type "IPA"
AbvIbuBeer$Style[grepl("IPA", AbvIbuBeer$Style, fixed = TRUE)] = "IPA"
# Only keep beers with style 'IPA' or 'Other'
AbvIbuBeer = AbvIbuBeer %>% filter(Style == "IPA" | Style == "Other")
AbvIbuBeer$Style = as.factor(AbvIbuBeer$Style)
# Drop all levels except for 'IPA' and 'Other'
AbvIbuBeer$Style = droplevels(AbvIbuBeer$Style)
table(AbvIbuBeer$Style)
##
## IPA Other
## 340 462
# Model relationship between ABV and IBU with knn
maxKvalue = 100
accuracyVector = c(maxKvalue)
for(i in 1:maxKvalue){
classifications = knn.cv(AbvIbuBeer[,c(4,5)], AbvIbuBeer$Style, k=i, prob = TRUE)
accuracyVector[i] = confusionMatrix(table(classifications, AbvIbuBeer$Style))$overall[1]
}
higestAccuracyKValue = which.max(accuracyVector)
higestAccuracyKValue
## [1] 7
accuracyVector
## [1] 0.7992519 0.8029925 0.8216958 0.8254364 0.8491272 0.8541147 0.8603491
## [8] 0.8566085 0.8516209 0.8516209 0.8528678 0.8528678 0.8491272 0.8466334
## [15] 0.8491272 0.8441397 0.8428928 0.8528678 0.8553616 0.8553616 0.8553616
## [22] 0.8553616 0.8553616 0.8553616 0.8553616 0.8553616 0.8553616 0.8553616
## [29] 0.8553616 0.8553616 0.8553616 0.8516209 0.8528678 0.8503741 0.8503741
## [36] 0.8478803 0.8541147 0.8541147 0.8541147 0.8541147 0.8541147 0.8541147
## [43] 0.8541147 0.8541147 0.8541147 0.8541147 0.8541147 0.8541147 0.8541147
## [50] 0.8541147 0.8541147 0.8541147 0.8541147 0.8541147 0.8528678 0.8528678
## [57] 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678
## [64] 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678
## [71] 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678
## [78] 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678
## [85] 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678
## [92] 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678 0.8528678
## [99] 0.8528678 0.8528678
# Plot Knn Accuracy and IBU/ABV by Style for question #8
plot(seq(1, maxKvalue, 1), accuracyVector, type = "l", xlab = "K value", ylab = "accuracy", main = "accuracy for k values")

AbvIbuBeer %>% ggplot(aes(x = ABV, y = IBU, color = Style)) +
geom_jitter() +
labs(title="IBU vs ABV for IPA's and Other Ales")

# Question #9
# The mean beer. Find the mean ABV, IBU, most common number of ounces, and most common style of beer
# Name: "Average Beer"
# Calculate the mean ABV/IBU
ABVBeer = mergedBeerData %>% filter(!is.na(ABV))
IBUBeer = mergedBeerData %>% filter(!is.na(IBU))
# with a count of 384 the American IPA is the most common style of beer
# most common ounces is 12
"Mean ABV:"
## [1] "Mean ABV:"
mean(ABVBeer$ABV)
## [1] 0.05985071
"Mean IBU:"
## [1] "Mean IBU:"
mean(IBUBeer$IBU)
## [1] 42.67322
# We could name the most "average" beer in the dataset:
avgBeer = mergedBeerData %>% filter(IBU == 42 & ABV == 0.059)
avgBeer
## Brewery_id Name.x Beer_ID ABV IBU Style
## 1 235 Harpoon IPA 1379 0.059 42 American IPA
## 2 269 Long Trail IPA 1926 0.059 42 English India Pale Ale (IPA)
## Ounces Name.y City State
## 1 12 Harpoon Brewery Boston MA
## 2 12 Long Trail Brewing Company Bridgewater Corners VT